In [3]:
# !pip install powerbiclient
# #pip install ipywidgets
In [4]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import plotly.express as px
In [5]:
data = pd.read_csv(r"C:\Users\Sunny\Downloads\shopping_trends.csv")
In [6]:
#data.to_csv("path" ) // to convert dataframe in to file
In [7]:
data
Out[7]:
| Customer ID | Age | Gender | Item Purchased | Category | Purchase Amount (USD) | Location | Size | Color | Season | Review Rating | Subscription Status | Payment Method | Shipping Type | Discount Applied | Promo Code Used | Previous Purchases | Preferred Payment Method | Frequency of Purchases | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 55 | Male | Blouse | Clothing | 53 | Kentucky | L | Gray | Winter | 3.1 | Yes | Credit Card | Express | Yes | Yes | 14 | Venmo | Fortnightly |
| 1 | 2 | 19 | Male | Sweater | Clothing | 64 | Maine | L | Maroon | Winter | 3.1 | Yes | Bank Transfer | Express | Yes | Yes | 2 | Cash | Fortnightly |
| 2 | 3 | 50 | Male | Jeans | Clothing | 73 | Massachusetts | S | Maroon | Spring | 3.1 | Yes | Cash | Free Shipping | Yes | Yes | 23 | Credit Card | Weekly |
| 3 | 4 | 21 | Male | Sandals | Footwear | 90 | Rhode Island | M | Maroon | Spring | 3.5 | Yes | PayPal | Next Day Air | Yes | Yes | 49 | PayPal | Weekly |
| 4 | 5 | 45 | Male | Blouse | Clothing | 49 | Oregon | M | Turquoise | Spring | 2.7 | Yes | Cash | Free Shipping | Yes | Yes | 31 | PayPal | Annually |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3895 | 3896 | 40 | Female | Hoodie | Clothing | 28 | Virginia | L | Turquoise | Summer | 4.2 | No | Cash | 2-Day Shipping | No | No | 32 | Venmo | Weekly |
| 3896 | 3897 | 52 | Female | Backpack | Accessories | 49 | Iowa | L | White | Spring | 4.5 | No | PayPal | Store Pickup | No | No | 41 | Bank Transfer | Bi-Weekly |
| 3897 | 3898 | 46 | Female | Belt | Accessories | 33 | New Jersey | L | Green | Spring | 2.9 | No | Credit Card | Standard | No | No | 24 | Venmo | Quarterly |
| 3898 | 3899 | 44 | Female | Shoes | Footwear | 77 | Minnesota | S | Brown | Summer | 3.8 | No | PayPal | Express | No | No | 24 | Venmo | Weekly |
| 3899 | 3900 | 52 | Female | Handbag | Accessories | 81 | California | M | Beige | Spring | 3.1 | No | Bank Transfer | Store Pickup | No | No | 33 | Venmo | Quarterly |
3900 rows × 19 columns
In [8]:
data.sample(4) # randomely fetch the information
Out[8]:
| Customer ID | Age | Gender | Item Purchased | Category | Purchase Amount (USD) | Location | Size | Color | Season | Review Rating | Subscription Status | Payment Method | Shipping Type | Discount Applied | Promo Code Used | Previous Purchases | Preferred Payment Method | Frequency of Purchases | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2276 | 2277 | 31 | Male | Jeans | Clothing | 82 | Wyoming | L | Red | Summer | 4.1 | No | Bank Transfer | 2-Day Shipping | No | No | 25 | Debit Card | Weekly |
| 2368 | 2369 | 29 | Male | Sunglasses | Accessories | 38 | New Jersey | M | Lavender | Summer | 3.3 | No | Credit Card | Standard | No | No | 6 | Debit Card | Every 3 Months |
| 2684 | 2685 | 46 | Female | Dress | Clothing | 45 | Washington | M | Gray | Summer | 4.7 | No | Venmo | Free Shipping | No | No | 42 | Credit Card | Bi-Weekly |
| 2851 | 2852 | 58 | Female | Blouse | Clothing | 73 | Nevada | L | Teal | Summer | 3.0 | No | Bank Transfer | Store Pickup | No | No | 42 | Debit Card | Monthly |
In [9]:
data.describe()
Out[9]:
| Customer ID | Age | Purchase Amount (USD) | Review Rating | Previous Purchases | |
|---|---|---|---|---|---|
| count | 3900.000000 | 3900.000000 | 3900.000000 | 3900.000000 | 3900.000000 |
| mean | 1950.500000 | 44.068462 | 59.764359 | 3.749949 | 25.351538 |
| std | 1125.977353 | 15.207589 | 23.685392 | 0.716223 | 14.447125 |
| min | 1.000000 | 18.000000 | 20.000000 | 2.500000 | 1.000000 |
| 25% | 975.750000 | 31.000000 | 39.000000 | 3.100000 | 13.000000 |
| 50% | 1950.500000 | 44.000000 | 60.000000 | 3.700000 | 25.000000 |
| 75% | 2925.250000 | 57.000000 | 81.000000 | 4.400000 | 38.000000 |
| max | 3900.000000 | 70.000000 | 100.000000 | 5.000000 | 50.000000 |
In [10]:
data.describe(include="object")
Out[10]:
| Gender | Item Purchased | Category | Location | Size | Color | Season | Subscription Status | Payment Method | Shipping Type | Discount Applied | Promo Code Used | Preferred Payment Method | Frequency of Purchases | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 3900 | 3900 | 3900 | 3900 | 3900 | 3900 | 3900 | 3900 | 3900 | 3900 | 3900 | 3900 | 3900 | 3900 |
| unique | 2 | 25 | 4 | 50 | 4 | 25 | 4 | 2 | 6 | 6 | 2 | 2 | 6 | 7 |
| top | Male | Blouse | Clothing | Montana | M | Olive | Spring | No | Credit Card | Free Shipping | No | No | PayPal | Every 3 Months |
| freq | 2652 | 171 | 1737 | 96 | 1755 | 177 | 999 | 2847 | 696 | 675 | 2223 | 2223 | 677 | 584 |
In [11]:
data.shape
Out[11]:
(3900, 19)
In [12]:
shop_dup = data[data.duplicated()]
shop_dup
Out[12]:
| Customer ID | Age | Gender | Item Purchased | Category | Purchase Amount (USD) | Location | Size | Color | Season | Review Rating | Subscription Status | Payment Method | Shipping Type | Discount Applied | Promo Code Used | Previous Purchases | Preferred Payment Method | Frequency of Purchases |
|---|
In [13]:
data.dtypes
Out[13]:
Customer ID int64 Age int64 Gender object Item Purchased object Category object Purchase Amount (USD) int64 Location object Size object Color object Season object Review Rating float64 Subscription Status object Payment Method object Shipping Type object Discount Applied object Promo Code Used object Previous Purchases int64 Preferred Payment Method object Frequency of Purchases object dtype: object
In [14]:
data.columns
Out[14]:
Index(['Customer ID', 'Age', 'Gender', 'Item Purchased', 'Category',
'Purchase Amount (USD)', 'Location', 'Size', 'Color', 'Season',
'Review Rating', 'Subscription Status', 'Payment Method',
'Shipping Type', 'Discount Applied', 'Promo Code Used',
'Previous Purchases', 'Preferred Payment Method',
'Frequency of Purchases'],
dtype='object')
In [15]:
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3900 entries, 0 to 3899 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Customer ID 3900 non-null int64 1 Age 3900 non-null int64 2 Gender 3900 non-null object 3 Item Purchased 3900 non-null object 4 Category 3900 non-null object 5 Purchase Amount (USD) 3900 non-null int64 6 Location 3900 non-null object 7 Size 3900 non-null object 8 Color 3900 non-null object 9 Season 3900 non-null object 10 Review Rating 3900 non-null float64 11 Subscription Status 3900 non-null object 12 Payment Method 3900 non-null object 13 Shipping Type 3900 non-null object 14 Discount Applied 3900 non-null object 15 Promo Code Used 3900 non-null object 16 Previous Purchases 3900 non-null int64 17 Preferred Payment Method 3900 non-null object 18 Frequency of Purchases 3900 non-null object dtypes: float64(1), int64(4), object(14) memory usage: 579.0+ KB
In [16]:
data.isnull().sum()
Out[16]:
Customer ID 0 Age 0 Gender 0 Item Purchased 0 Category 0 Purchase Amount (USD) 0 Location 0 Size 0 Color 0 Season 0 Review Rating 0 Subscription Status 0 Payment Method 0 Shipping Type 0 Discount Applied 0 Promo Code Used 0 Previous Purchases 0 Preferred Payment Method 0 Frequency of Purchases 0 dtype: int64
In [17]:
print(f"The unique values of the 'Gender' column are :{data['Gender'] .unique()}")
The unique values of the 'Gender' column are :['Male' 'Female']
In [18]:
print(f"The unique values of the 'Item Purchased' column are :{data['Item Purchased'] .unique()}")
The unique values of the 'Item Purchased' column are :['Blouse' 'Sweater' 'Jeans' 'Sandals' 'Sneakers' 'Shirt' 'Shorts' 'Coat' 'Handbag' 'Shoes' 'Dress' 'Skirt' 'Sunglasses' 'Pants' 'Jacket' 'Hoodie' 'Jewelry' 'T-shirt' 'Scarf' 'Hat' 'Socks' 'Backpack' 'Belt' 'Boots' 'Gloves']
In [19]:
print(f"The unique values of the 'Location' column are :{data['Location'] .unique()}")
print()
The unique values of the 'Location' column are :['Kentucky' 'Maine' 'Massachusetts' 'Rhode Island' 'Oregon' 'Wyoming' 'Montana' 'Louisiana' 'West Virginia' 'Missouri' 'Arkansas' 'Hawaii' 'Delaware' 'New Hampshire' 'New York' 'Alabama' 'Mississippi' 'North Carolina' 'California' 'Oklahoma' 'Florida' 'Texas' 'Nevada' 'Kansas' 'Colorado' 'North Dakota' 'Illinois' 'Indiana' 'Arizona' 'Alaska' 'Tennessee' 'Ohio' 'New Jersey' 'Maryland' 'Vermont' 'New Mexico' 'South Carolina' 'Idaho' 'Pennsylvania' 'Connecticut' 'Utah' 'Virginia' 'Georgia' 'Nebraska' 'Iowa' 'South Dakota' 'Minnesota' 'Washington' 'Wisconsin' 'Michigan']
In [20]:
print(f"The unique values of the 'Category' column are :{data['Category'] .unique()}")
print()
print(f"The unique values of the 'Size ' column are :{data['Size'] .unique()}")
print()
print(f"The unique values of the 'Subscription Status ' column are :{data['Subscription Status'] .unique()}")
print()
print(f"The unique values of the 'Payment Method' column are :{data['Payment Method'] .unique()}")
print()
print(f"The unique values of the 'Shipping Type' column are :{data['Shipping Type'] .unique()}")
print()
print(f"The unique values of the 'Discount Applied ' column are :{data['Discount Applied'] .unique()}")
print()
print(f"The unique values of the 'Promo Code Used ' column are :{data['Promo Code Used'] .unique()}")
print()
print(f"The unique values of the 'Preferred Payment Method ' column are :{data['Preferred Payment Method'] .unique()}")
The unique values of the 'Category' column are :['Clothing' 'Footwear' 'Outerwear' 'Accessories'] The unique values of the 'Size ' column are :['L' 'S' 'M' 'XL'] The unique values of the 'Subscription Status ' column are :['Yes' 'No'] The unique values of the 'Payment Method' column are :['Credit Card' 'Bank Transfer' 'Cash' 'PayPal' 'Venmo' 'Debit Card'] The unique values of the 'Shipping Type' column are :['Express' 'Free Shipping' 'Next Day Air' 'Standard' '2-Day Shipping' 'Store Pickup'] The unique values of the 'Discount Applied ' column are :['Yes' 'No'] The unique values of the 'Promo Code Used ' column are :['Yes' 'No'] The unique values of the 'Preferred Payment Method ' column are :['Venmo' 'Cash' 'Credit Card' 'PayPal' 'Bank Transfer' 'Debit Card']
In [21]:
data['Age'].value_counts() # how many counts based on age
Out[21]:
Age 69 88 57 87 41 86 25 85 49 84 50 83 54 83 27 83 62 83 32 82 19 81 58 81 42 80 43 79 28 79 31 79 37 77 46 76 29 76 68 75 59 75 63 75 56 74 36 74 55 73 52 73 64 73 35 72 51 72 65 72 40 72 45 72 47 71 66 71 30 71 23 71 38 70 53 70 18 69 21 69 26 69 34 68 48 68 24 68 39 68 70 67 22 66 61 65 60 65 33 63 20 62 67 54 44 51 Name: count, dtype: int64
In [22]:
data['Age'].mean()
Out[22]:
44.06846153846154
In [ ]:
Descriptive Analysis¶
In [23]:
# analysis past data to understand trends and patterns.
In [24]:
shop =data['Gender'].value_counts()
shop.plot(kind = 'pie', explode=(0,0.1),autopct = '%1.1f%%')
plt.xlabel("Gender")
Out[24]:
Text(0.5, 0, 'Gender')
In [25]:
#data['Gender'].value_counts().plot(kind ='bar'
In [26]:
data['Age_category'] = pd.cut(data['Age'],bins=[0,15,18,30,50,70],labels=['child' , 'teen',
'Multi-aged adults',
'young Adult','old'])
In [27]:
fig = px.histogram(data , y='Age' , x ='Age_category')
fig.show()
How does the average purchase amount vary accross different product ?¶
In [28]:
data['Category'].unique()
Out[28]:
array(['Clothing', 'Footwear', 'Outerwear', 'Accessories'], dtype=object)
In [29]:
data.groupby('Category')['Purchase Amount (USD)'].mean()
Out[29]:
Category Accessories 59.838710 Clothing 60.025331 Footwear 60.255426 Outerwear 57.172840 Name: Purchase Amount (USD), dtype: float64
In [30]:
plt.figure(figsize=(20,6))
data1 = data['Category'].value_counts()
explode =[0.1]*len(data1)
data1.plot(kind ='pie',explode =explode,autopct= '%1.1f%%')
plt.xlabel('Category')
plt.legend()
plt.show()
Which Gender has the highest number of purchases ??¶
In [31]:
sns.barplot(data , x='Gender' , y= 'Purchase Amount (USD)')
Out[31]:
<Axes: xlabel='Gender', ylabel='Purchase Amount (USD)'>
Seasons or months where customer spending is significantly higher¶
In [32]:
# seasons sales
data2 = data['Season'].value_counts()
data2
Out[32]:
Season Spring 999 Fall 975 Winter 971 Summer 955 Name: count, dtype: int64
In [33]:
plt.figure(figsize=(20,6))
data1 = data['Season'].value_counts()
explode =[0.1]*len(data1)
data1.plot(kind ='pie',explode =explode,autopct= '%1.1f%%')
plt.xlabel('Category')
plt.legend()
plt.show()
Payment MEthod is the most used by customers¶
In [34]:
data.groupby('Payment Method')['Purchase Amount (USD)'].mean().sort_values(ascending=False)
Out[34]:
Payment Method Venmo 61.241960 Credit Card 61.159483 Cash 59.927469 Bank Transfer 58.738924 PayPal 58.697492 Debit Card 58.638231 Name: Purchase Amount (USD), dtype: float64
In [35]:
sns.barplot(x="Payment Method",y='Purchase Amount (USD)',data =data)
Out[35]:
<Axes: xlabel='Payment Method', ylabel='Purchase Amount (USD)'>
What are the most commonly purchased items in each category ??¶
In [36]:
data.groupby('Category')['Item Purchased'].value_counts()
Out[36]:
Category Item Purchased
Accessories Jewelry 171
Belt 161
Sunglasses 161
Scarf 157
Hat 154
Handbag 153
Backpack 143
Gloves 140
Clothing Blouse 171
Pants 171
Shirt 169
Dress 166
Sweater 164
Socks 159
Skirt 158
Shorts 157
Hoodie 151
T-shirt 147
Jeans 124
Footwear Sandals 160
Shoes 150
Sneakers 145
Boots 144
Outerwear Jacket 163
Coat 161
Name: count, dtype: int64
In [37]:
fig = px.histogram(data, x= 'Item Purchased', color= 'Category')
fig.show()
In [38]:
# Extract Correlation Features
In [39]:
num_fea = data[['Age','Purchase Amount (USD)','Review Rating','Previous Purchases']]
num_fea.head()
Out[39]:
| Age | Purchase Amount (USD) | Review Rating | Previous Purchases | |
|---|---|---|---|---|
| 0 | 55 | 53 | 3.1 | 14 |
| 1 | 19 | 64 | 3.1 | 2 |
| 2 | 50 | 73 | 3.1 | 23 |
| 3 | 21 | 90 | 3.5 | 49 |
| 4 | 45 | 49 | 2.7 | 31 |
In [40]:
corr = num_fea.corr()
corr
Out[40]:
| Age | Purchase Amount (USD) | Review Rating | Previous Purchases | |
|---|---|---|---|---|
| Age | 1.000000 | -0.010424 | -0.021949 | 0.040445 |
| Purchase Amount (USD) | -0.010424 | 1.000000 | 0.030776 | 0.008063 |
| Review Rating | -0.021949 | 0.030776 | 1.000000 | 0.004229 |
| Previous Purchases | 0.040445 | 0.008063 | 0.004229 | 1.000000 |
In [41]:
#plt.figure(figsize(5,3))
sns.heatmap(corr,annot=True,cmap ='BuPu')
Out[41]:
<Axes: >
In [42]:
In [43]:
In [ ]:
In [45]:
In [ ]:
In [48]:
In [49]:
In [52]:
In [ ]: